knitr::opts_chunk$set(echo = T)
library(catalogueR)
library(dplyr)
root <- getwd()
hpc_root <- "/sc/arion/projects/pd-omics/brian/COVID19_PheWAS"Previously we identified significant eQTL that showed overlap with significant COVID pheWAS hits.
Here, we aim to more robustly test whether the genetic signals underlying these simple overlaps are indeed the same, using a methodology called colocalization.
Specifically, we will use coloc::coloc.abf, which uses Approximate Bayes Factor to infer the probability that each SNP is causal in a given locus in each of the datasets (eQTL and pheWAS). It then tests the hypothesis that those signals show substantially similar association distributions. ## Prepare data
First, let’s import the COVID19 GWAS summary statistic and split them into locus folders
proxy_snps <- readxl::read_excel(file.path(root,"data/GCST90000255and6_GRCh38_Plessthan1eMinus5_combined_pruned.xlsx")) %>%
dplyr::mutate(proxy=gsub(":","-",variant_id)) %>%
data.table::data.table()%>%
# eQTL Catalogue does not have data on chr23 (X)
subset(chromosome!=23)In particular, we’re interested in several loci. Let’s prioritize those.
## This approach only leaves X-chrom loci, which are not available in eQTL Catalogue.
# highlight_snps <- readxl::read_excel(file.path(root,"data/GCST90000255and6_GRCh38_Plessthan1eMinus5_combined_pruned.xlsx"), sheet = 3) %>%
# subset(highlight=="yellow")
# proxy_snps <- proxy_snps[proxy_snps$variant_id %in% highlight_snps$variant_id]
GWAS.QTL <- data.table::fread(file.path(root,"pheWAS_queries/eQTL_Catalogue.annotated.tsv.gz"))
GWAS.QTL <- subset(GWAS.QTL, FDR.QTL<.05)
proxy_snps <- subset(proxy_snps, variant_id %in% GWAS.QTL$variant_id)
createDT(proxy_snps)The original authors of the COVID19 GWAS don’t provide the complete summary statistics. They are missing:
MAF: Minor Allele Frequency of each SNP. Nor do they provide the frequency of one of the allele, so that MAF could be inferred. Instead, we must borrow MAF from the eQTL Catalogue datasets and assume that they are similar enough to the GWAS (not always a valid assumption but can be used).
N: Per-SNP sample size. Sample size can vary from across SNPs within a study due to QC steps, so it’s best to have per-SNP sample size. Effective sample size takes into account the proportion of cases vs. controls in your sample, and is even better. Here, we have neither and thus are forced to assume that all SNPs have the same effective sample size (based on the total number of participants in the GWAS). > “835 patients and 1255 control participants from Italy and 775 patients and 950 control participants from Spain were included in the final analysis.”
N_cases <- 835 + 775
N_controls <- 1255 + 950
# N is effective sample size
N <- round(4.0 / (1.0/N_cases + 1.0/N_controls), digits = 0)DAT <- data.table::fread(file.path(root,"data/GCST90000256_GRCh38.tsv.gz"),
nThread = 10) %>%
dplyr::rename(CHR=chromosome,
POS=base_pair_location,
A1=effect_allele,
A2=other_allele,
Effect=beta,
StdErr=standard_error,
P=p_value) %>%
dplyr::mutate(variant_id=gsub(":","-",variant_id))
# DAT$N <- N
createDT(head(DAT))catalogueR::rsids_from_coords().bp_distance <- 1000000 # 1Mb windows
force_new_subset <- F
sumstats_paths <- lapply(1:nrow(proxy_snps), function(i){
proxy_dat <- proxy_snps[i,]
loc <- proxy_dat$proxy
message(loc)
out <- file.path("data/COVID19_GWAS",loc,paste0(loc,"_COVID19.tsv.gz"))
if(!file.exists(out) & force_new_subset==F){
min_POS <- proxy_dat$base_pair_location - bp_distance
max_POS <- proxy_dat$base_pair_location + bp_distance
dat <- subset(DAT,
CHR==proxy_dat$chromosome &
POS>=min_POS & POS<=max_POS)
dat$Locus <- loc
# Get RSIDs
dat$CHR <- gsub(23,"X",dat$CHR )
dat <- catalogueR::rsids_from_coords(dat, genome_build = "hg38")
dat <- dplyr::rename(dat, SNP=RefSNP_id)
dir.create(dirname(out), showWarnings = F, recursive = T)
data.table::fwrite(dat, out, sep="\t")
}
return(out)
}) %>% unlist()
{
# sumstats_paths <- list.files(file.path(root,"data/COVID19_GWAS"),
# full.names = T, recursive = T)
sumstats_paths <- setNames(sumstats_paths, basename(dirname(sumstats_paths)))
}We will need to rerun our previous eQTL_Catalogue.query in the COVID19_PheWAS step, as this only queried the lead pheWAS SNPs. For colocalization analyses, we need all the SNPs in each locus.
Exactly how you define a locus can vary, but here we use the simple approach of pulling all SNP +/- 1Mb from the lead SNP in each locus (2Mb windows).
In particular, we’re interested in immune stimulation studies, which we can limit our query to with qtl_search.
n_cores <- parallel::detectCores()
# data.table::getDTthreads()
data.table::setDTthreads(threads = 1)
# options(url.method = "curl")
# options(download.file.method="curl")
GWAS.QTL <- catalogueR::eQTL_Catalogue.query(sumstats_paths = sumstats_paths,
output_dir = "GWAS_queries",
# qtl_search = c("Fairfax_2014",
# "Alasoo_2018",
# "Nedelec_2016",
# "Quach_2016"),
genome_build = "hg38",
force_new_subset = F,
merge_with_gwas = T,
# Have to write to files
# to use in coloc (currently)
split_files = T,
multithread_tabix = F,
conda_env = NULL,
progress_bar = F,
nThread = 30)
gwas.qtl_path <- list.files("GWAS_queries",pattern = ".tsv.gz",
full.names = T, recursive = T)
# file.remove(gwas.qtl_path[file.size(gwas.qtl_path)<80])
# startsWith(basename(gwas.qtl_path), paste(highlight_loci, collapse = "|"))
GWAS.QTL <- catalogueR::gather_files(gwas.qtl_path, nThread = 6)
data.table::fwrite(GWAS.QTL,"GWAS_queries/eQTL_Catalogue.tsv.gz")Perform more robust colocalization analyses between COVID19 GWAS and eQTLs to identify shared genetic signals.
To reduce the number of tests, we only run colocalization on loci which the lead SNPs from both the pheWAS and eQTL were genome-wide significant.
# only run coloc on the loci whose proxy GWAS SNps had overlap with eSNPs.
gwas.qtl_path.sub <- grep(paste(proxy_snps$proxy,collapse="|"),gwas.qtl_path, value = T)
gwas.qtl_path.sub <- grep(paste(gwas.qtl_path.sub,collapse="|"),gwas.qtl_path.sub, value = T)
coloc_QTLs <- catalogueR::run_coloc(gwas.qtl_paths = gwas.qtl_path.sub,
nThread = 1,
top_snp_only = T,
save_path = "GWAS_queries/COVID19_GWAS.coloc.tsv.gz",
gwas_sample_size = N)# coloc_QTLs.grouped <- coloc_QTLs %>%
# dplyr::group_by(Locus.GWAS, qtl_id, gene.QTL) %>%
# dplyr::slice_max(SNP.PP.H4, with_ties = F) %>%
# data.table::data.table()
coloc_QTLs <- data.table::fread("data/COVID19_GWAS.coloc.top_snps.tsv.gz")
# coloc_QTLs <- catalogueR::eQTL_Catalogue.annotate_tissues(coloc_QTLs)
coloc_plot <- catalogueR::plot_coloc_summary(coloc_QTLs = coloc_QTLs,
show_plot = T,
PP_thresh = .8)ggplot2::ggsave("plots/coloc.all_datasets.pdf",coloc_plot,
dpi = 300, height = 10, width = 8)utils::sessionInfo()## R version 4.0.3 (2020-10-10)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur 10.16
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_GB.UTF-8/en_GB.UTF-8/en_GB.UTF-8/C/en_GB.UTF-8/en_GB.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] dplyr_1.0.4 catalogueR_0.1.0
##
## loaded via a namespace (and not attached):
## [1] Rcpp_1.0.6 highr_0.8 cellranger_1.1.0 pillar_1.4.7
## [5] bslib_0.2.4 compiler_4.0.3 jquerylib_0.1.3 R.methodsS3_1.8.1
## [9] R.utils_2.10.1 tools_4.0.3 digest_0.6.27 jsonlite_1.7.2
## [13] evaluate_0.14 lifecycle_1.0.0 tibble_3.0.6 gtable_0.3.0
## [17] pkgconfig_2.0.3 rlang_0.4.10 DBI_1.1.1 crosstalk_1.1.1
## [21] yaml_2.2.1 xfun_0.21 stringr_1.4.0 knitr_1.31
## [25] htmlwidgets_1.5.3 generics_0.1.0 vctrs_0.3.6 sass_0.3.1
## [29] DT_0.17 grid_4.0.3 tidyselect_1.1.0 glue_1.4.2
## [33] data.table_1.13.6 R6_2.5.0 readxl_1.3.1 rmarkdown_2.7
## [37] farver_2.0.3 tidyr_1.1.2 purrr_0.3.4 ggplot2_3.3.3
## [41] magrittr_2.0.1 scales_1.1.1 ellipsis_0.3.1 htmltools_0.5.1.1
## [45] assertthat_0.2.1 colorspace_2.0-0 labeling_0.4.2 stringi_1.5.3
## [49] munsell_0.5.0 crayon_1.4.1 R.oo_1.24.0